This file contains an example of tuning a Logistic Regression model with BayesSearchCV
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
# Load the pre-split training features/target pickled by an upstream step.
# NOTE(review): pickle.load is only safe on trusted files; these are local
# project artifacts, so that holds here.
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)
# Summary statistics for the numeric features (helpsk helper); per the output
# below, only `duration` has missing values (40 nulls).
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary of the categorical features; per the output below, only
# `checking_status` has missing values (37 nulls).
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten target values (presumably 1 = default, 0 = no default,
# given the 'default' column name used later — TODO confirm upstream encoding).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Raw class counts: 559 zeros vs 241 ones (see output below).
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class proportions: ~70% vs ~30% — moderately imbalanced, which motivates the
# 'roc_auc' scoring used below. Compute np.unique once instead of twice.
_, class_counts = np.unique(y_train, return_counts=True)
class_counts / np.sum(class_counts)
array([0.69875, 0.30125])
# Pre-built search space for logistic regression (pipeline + hyper-parameter
# distributions) provided by the helpsk package.
search_space = hlp.sklearn_search.LogisticBayesianSearchSpace(random_state=42)
# pip install scikit-optimize
from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(data=X_train),
    search_spaces=search_space.search_spaces(),
    # 5-fold CV repeated twice -> 10 scores per candidate.
    cv=RepeatedKFold(n_splits=5, n_repeats=2),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# Time the full Bayesian search (fit runs all candidate trials).
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 57.651 seconds; 1.0 minutes
# Best mean cross-validated roc_auc found by the search.
print(bayes_search.best_score_)
0.7804183342518474
# Hyper-parameters of the best candidate (pipeline parameter paths).
print(bayes_search.best_params_)
OrderedDict([('model', LogisticRegression(C=0.08541124686163197, max_iter=1000, random_state=42)), ('model__C', 0.08541124686163197), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Wrap the fitted searcher in a helpsk results object; the parameter-name
# mappings shorten the verbose pipeline paths for display (e.g. 'model__C' -> 'C').
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better = True,
    parameter_name_mappings = search_space.param_name_mappings()
)
# Persist the experiment, then reload it to show the round-trip works without
# re-running the search.
results.to_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results.best_score
0.7804183342518474
# Best parameters with the shortened (mapped) names.
results.best_params
{'model': 'LogisticRegression()',
'C': 0.08541124686163197,
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()',
'encoder': 'OneHotEncoder()'}
# All trials ranked by mean score, with 95% confidence intervals per trial.
results.to_formatted_dataframe(num_rows=100, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | encoder |
|---|---|---|---|---|---|---|---|
| 1 | 0.780 | 0.755 | 0.806 | 0.085 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 0.774 | 0.754 | 0.794 | 0.034 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 0.773 | 0.755 | 0.791 | 0.156 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 0.773 | 0.746 | 0.800 | 0.094 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 5 | 0.773 | 0.742 | 0.803 | 0.058 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 6 | 0.772 | 0.751 | 0.794 | 0.103 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 7 | 0.772 | 0.744 | 0.801 | 0.063 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 8 | 0.771 | 0.756 | 0.786 | 0.013 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 9 | 0.771 | 0.739 | 0.803 | 0.030 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 10 | 0.771 | 0.746 | 0.795 | 0.085 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 11 | 0.770 | 0.748 | 0.792 | 0.064 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 12 | 0.768 | 0.744 | 0.791 | 0.087 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 13 | 0.767 | 0.739 | 0.795 | 0.087 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 14 | 0.766 | 0.753 | 0.778 | 0.101 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 15 | 0.763 | 0.732 | 0.794 | 0.534 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 16 | 0.763 | 0.741 | 0.785 | 0.011 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 17 | 0.762 | 0.740 | 0.785 | 0.476 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 18 | 0.760 | 0.739 | 0.781 | 95.839 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 19 | 0.759 | 0.738 | 0.780 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 20 | 0.758 | 0.741 | 0.775 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 21 | 0.757 | 0.720 | 0.795 | 32.731 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 22 | 0.757 | 0.725 | 0.789 | 0.866 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 23 | 0.756 | 0.722 | 0.790 | 0.010 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 24 | 0.756 | 0.729 | 0.783 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 25 | 0.756 | 0.724 | 0.788 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 26 | 0.756 | 0.749 | 0.763 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 27 | 0.755 | 0.726 | 0.784 | 1.596 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 28 | 0.755 | 0.721 | 0.789 | <NA> | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 29 | 0.755 | 0.724 | 0.786 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 30 | 0.754 | 0.721 | 0.786 | 0.004 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 31 | 0.753 | 0.729 | 0.777 | 0.001 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 32 | 0.752 | 0.730 | 0.774 | 22.913 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 33 | 0.752 | 0.731 | 0.773 | 99.462 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 34 | 0.751 | 0.739 | 0.764 | 0.031 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 35 | 0.750 | 0.732 | 0.767 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 36 | 0.747 | 0.721 | 0.774 | 22.376 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 37 | 0.745 | 0.709 | 0.781 | 11.655 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 38 | 0.743 | 0.704 | 0.782 | 97.246 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 39 | 0.734 | 0.704 | 0.764 | 3.489 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 40 | 0.733 | 0.707 | 0.760 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 41 | 0.733 | 0.703 | 0.763 | 98.761 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 42 | 0.732 | 0.705 | 0.760 | 0.000 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 43 | 0.729 | 0.701 | 0.756 | 0.120 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 44 | 0.728 | 0.702 | 0.754 | 0.403 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 45 | 0.728 | 0.700 | 0.755 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 46 | 0.728 | 0.699 | 0.756 | 0.003 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 47 | 0.723 | 0.703 | 0.744 | 100.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 48 | 0.722 | 0.694 | 0.750 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 49 | 0.720 | 0.678 | 0.763 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 50 | 0.709 | 0.688 | 0.730 | 0.001 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 51 | 0.699 | 0.667 | 0.730 | 0.000 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
# Score of each trial over time — Bayesian search should trend upward as it
# exploits promising regions of the space.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
results.plot_performance_across_trials(size='C', color='scaler').show()
# Which hyper-parameter values were sampled at each trial.
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
# Score vs. each numeric / non-numeric hyper-parameter individually.
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs. C, colored by the other categorical hyper-parameters.
results.plot_score_vs_parameter(
    parameter='C',
    color='scaler'
)
results.plot_score_vs_parameter(
    parameter='C',
    color='encoder'
)
# (The original line had notebook-output residue 'roc_auc Mean¶' fused onto the
# front of the assignment, which is a syntax error in plain Python — removed.)
# Column holding the mean cross-validation score, e.g. 'roc_auc Mean'.
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
# Keep only the score column plus the hyper-parameter columns.
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 32 | 0.780418 | 0.085411 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 24 | 0.773843 | 0.033766 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 27 | 0.772769 | 0.156134 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 33 | 0.772658 | 0.093997 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 25 | 0.772622 | 0.057627 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
# Make column names formula-safe for statsmodels: replace spaces with
# underscores, then drop any remaining non-alphanumeric characters
# ('roc_auc Mean' -> 'roc_auc_Mean').
cleaned_column_names = [''.join(e for e in x.replace(' ', '_') if e == '_' or e.isalnum()) for x in score_dataframe.columns.tolist()]
# Map original name -> cleaned name for DataFrame.rename below.
cleaned_column_names = dict(zip(score_dataframe.columns.tolist(), cleaned_column_names))
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'C': 'C',
'imputer': 'imputer',
'scaler': 'scaler',
'encoder': 'encoder'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on the hyper-parameters to see which ones drive
# performance; formula ends up as 'roc_auc_Mean ~ C + imputer + scaler + encoder'.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier — later cells that need the experiment results would
# have to recreate it.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.642
Model: OLS Adj. R-squared: 0.601
Method: Least Squares F-statistic: 15.78
Date: Mon, 14 Feb 2022 Prob (F-statistic): 6.91e-09
Time: 17:14:42 Log-Likelihood: 153.89
No. Observations: 50 AIC: -295.8
Df Residuals: 44 BIC: -284.3
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7188 0.005 132.173 0.000 0.708 0.730
imputer[T.SimpleImputer(strategy='median')] 0.0019 0.006 0.319 0.751 -0.010 0.014
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0006 0.004 -0.165 0.869 -0.008 0.007
scaler[T.StandardScaler()] 0.0065 0.004 1.867 0.069 -0.001 0.014
encoder[T.OneHotEncoder()] 0.0366 0.005 8.127 0.000 0.028 0.046
C -2.423e-05 5.81e-05 -0.417 0.679 -0.000 9.29e-05
==============================================================================
Omnibus: 13.891 Durbin-Watson: 1.565
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.834
Skew: -1.191 Prob(JB): 0.000601
Kurtosis: 4.202 Cond. No. 149.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.compose import ColumnTransformer

# Standardize the numeric columns (score and C) so the regression coefficients
# are on comparable scales; pass the categorical columns through untouched.
# (Removed: redundant re-imports of StandardScaler/pandas already imported at
# the top of the file, an unused `scaler = StandardScaler()` local, and a
# commented-out dead line.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# ColumnTransformer returns a numpy array with the numeric columns first,
# followed by the pass-through columns, in the order declared above.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['imputer', 'scaler', 'encoder']
| roc_auc_Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 0 | 1.560741 | -0.396496 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 1 | 1.204322 | -0.398245 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 1.146114 | -0.3941 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 1.140102 | -0.396205 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 1.138179 | -0.397437 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
# ColumnTransformer output is an object-dtype array (numeric + string columns
# mixed); restore numeric dtypes before fitting OLS.
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['C'] = score_dataframe_transformed['C'].astype('float')
print(formula)
# Re-fit the same formula on standardized data: t-stats/p-values are unchanged
# (see identical values in both summaries) but coefficients become comparable.
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.642
Model: OLS Adj. R-squared: 0.601
Method: Least Squares F-statistic: 15.78
Date: Mon, 14 Feb 2022 Prob (F-statistic): 6.91e-09
Time: 17:14:42 Log-Likelihood: -45.748
No. Observations: 50 AIC: 103.5
Df Residuals: 44 BIC: 115.0
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -1.7963 0.288 -6.234 0.000 -2.377 -1.216
imputer[T.SimpleImputer(strategy='median')] 0.1008 0.316 0.319 0.751 -0.536 0.737
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0330 0.199 -0.165 0.869 -0.435 0.369
scaler[T.StandardScaler()] 0.3547 0.190 1.867 0.069 -0.028 0.738
encoder[T.OneHotEncoder()] 1.9849 0.244 8.127 0.000 1.493 2.477
C -0.0388 0.093 -0.417 0.679 -0.226 0.149
==============================================================================
Omnibus: 13.891 Durbin-Watson: 1.565
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.834
Skew: -1.191 Prob(JB): 0.000601
Kurtosis: 4.202 Cond. No. 6.98
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Tidy the fitted coefficients and their p-values into a DataFrame for plotting.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
# The intercept is the baseline, not a hyper-parameter effect — drop it.
coefficients = coefficients.query("feature != 'Intercept'")
# Flag coefficients significant at the conventional 0.05 level.
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | 0.100760 | 7.512338e-01 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.032967 | 8.694383e-01 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.354726 | 6.852016e-02 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 1.984936 | 2.656842e-10 | True |
| C | C | -0.038767 | 6.788970e-01 | False |
# Original (pre-cleaning) score column name; used in the plot title below.
score_variable
'roc_auc Mean'
# Horizontal bar chart of standardized coefficients, ordered by absolute
# magnitude; color flags statistical significance at the 0.05 level.
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance

# Permutation importance on the best full pipeline: shuffle each feature and
# measure the drop in the estimator's score, averaged over n_repeats shuffles.
# (Removed an unused `forest = bayes_search.best_estimator_['model']` local —
# a leftover from the sklearn random-forest example this cell was adapted
# from; the fitted model here is a LogisticRegression and `forest` was never
# referenced.)
start_time = time.time()
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# Mean importance per feature, sorted most- to least-important.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 3.596 seconds
import matplotlib.pyplot as plt
# BUG FIX: `result.importances_std` is in the ORIGINAL feature order, but
# `forest_importances` was sorted descending above — passing the raw array as
# yerr misaligns the error bars with the bars. Index the stds by feature name
# and reorder them to match the sorted importances.
importances_std = pd.Series(result.importances_std, index=feature_names)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std[forest_importances.index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Default rate split by foreign_worker: per the output below, foreign workers
# default ~31% of the time vs ~11% for non-foreign workers.
temp = X_train.copy()
temp['default'] = y_train
# Use the string aggregation name: passing np.mean to .agg is deprecated in
# modern pandas (FutureWarning) and resolves to 'mean' anyway.
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Age distribution split by default status.
# (Removed ~9 lines of commented-out template arguments — size/color/trendline/
# labels/title/custom_data — left over from the plotting snippet this cell was
# copied from; they referenced variables like `parser` that don't exist here.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign worker seems like it should be important but is ranked last in feature importance.